### Code for all cleanings based on the scraped PubMed data for Analyses
## Note: 
## Running this file requires high computer memory (at least 300 GB RAM).

################################################################################
###                    Article Author Names Cleaning 
################################################################################

library(readr)
library(dplyr)
library(stringr)
library(tidyr)

setwd("YOUR WORKING DIRECTORY")

# Read the scraped data
author <- read_csv("data_cleaning/intermeidate_data/author_meta.csv")
article <- read_csv("data_cleaning/intermeidate_data/article_meta.csv")
article <- subset(article, select = c("pmid", "year"))

# Match authors with articles and keep only post-2001 articles
merged <- left_join(author, article, by = c("PMID" = "pmid"))
merged <- subset(merged, merged$year >= 2002)

# Address the name formatting issues from the raw data
name <- str_replace_all(merged$FirstName, "[[:punct:]]", " ")
name <- str_replace_all(name, "[^[:alnum:]]", " ")
name <- gsub("\\s+", " ", name)

name_L <- str_replace_all(merged$LastName, "[[:punct:]]", " ")
name_L <- str_replace_all(name_L, "[^[:alnum:]]", " ")
name_L <- gsub("\\s+", " ", name_L)

merged$FirstName_trim <- name
merged$LastName_trim <- name_L

# Distinguish authors with full names from those with initials only
test <- separate(merged, FirstName_trim, into = c('a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i'), sep = " ", fill = "right")
test <- separate(test, LastName_trim, into = c('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I'), sep = " ", fill = "right")

merged_trim <- test
merged_trim$chara <- nchar(merged_trim$a)
merged_trim$charb <- nchar(merged_trim$b)
merged_trim$charc <- nchar(merged_trim$c)
merged_trim$chard <- nchar(merged_trim$d)
merged_trim$chare <- nchar(merged_trim$e)
merged_trim$charf <- nchar(merged_trim$f)
merged_trim$charg <- nchar(merged_trim$g)
merged_trim$charh <- nchar(merged_trim$h)
merged_trim$chari <- nchar(merged_trim$i)

merged_trim$charA <- nchar(merged_trim$A)
merged_trim$charB <- nchar(merged_trim$B)
merged_trim$charC <- nchar(merged_trim$C)
merged_trim$charD <- nchar(merged_trim$D)
merged_trim$charE <- nchar(merged_trim$E)
merged_trim$charF <- nchar(merged_trim$F)
merged_trim$charG <- nchar(merged_trim$G)
merged_trim$charH <- nchar(merged_trim$H)
merged_trim$charI <- nchar(merged_trim$I)

merged_trim[23:31][is.na(merged_trim[23:31])] = 0
merged_trim$binary_F <- ifelse(merged_trim$chara > 1 |
                                 merged_trim$charb > 1 |
                                 merged_trim$charc > 1 |
                                 merged_trim$chard > 1 |
                                 merged_trim$chare > 1 |
                                 merged_trim$charf > 1 |
                                 merged_trim$charg > 1 |
                                 merged_trim$charh > 1 |
                                 merged_trim$chari > 1, 1, 0)

merged_trim[32:40][is.na(merged_trim[32:40])] = 0
merged_trim$binary_L <- ifelse(merged_trim$charA > 1 |
                                 merged_trim$charB > 1 |
                                 merged_trim$charC > 1 |
                                 merged_trim$charD > 1 |
                                 merged_trim$charE > 1 |
                                 merged_trim$charF > 1 |
                                 merged_trim$charG > 1 |
                                 merged_trim$charH > 1 |
                                 merged_trim$charI > 1, 1, 0)

merged_trim$binary_full <- merged_trim$binary_F + merged_trim$binary_L


## Get the percentage of authors with full name in the dataset
a <-merged_trim %>% 
  count(year, binary_full) %>% 
  group_by(year) %>% 
  transmute(binary_full, Percentage=n/sum(n)*100)

# write.csv(a, "author_percent_coverage.csv")

# Keep only the authors with full name
merged_fullname <- subset(merged_trim, binary_full == 2)

# Prepare the cleaned first and last names for gender classification
merged_fullname$a[merged_fullname$chara < 2] <- NA
merged_fullname$b[merged_fullname$charb < 2] <- NA
merged_fullname$c[merged_fullname$charc < 2] <- NA
merged_fullname$d[merged_fullname$chard < 2] <- NA
merged_fullname$e[merged_fullname$chare < 2] <- NA
merged_fullname$f[merged_fullname$charf < 2] <- NA
merged_fullname$g[merged_fullname$charg < 2] <- NA
merged_fullname$h[merged_fullname$charh < 2] <- NA
merged_fullname$i[merged_fullname$chari < 2] <- NA

merged_fullname$A[merged_fullname$charA < 2] <- NA
merged_fullname$B[merged_fullname$charB < 2] <- NA
merged_fullname$C[merged_fullname$charC < 2] <- NA
merged_fullname$D[merged_fullname$charD < 2] <- NA
merged_fullname$E[merged_fullname$charE < 2] <- NA
merged_fullname$F[merged_fullname$charF < 2] <- NA
merged_fullname$G[merged_fullname$charG < 2] <- NA
merged_fullname$H[merged_fullname$charH < 2] <- NA
merged_fullname$I[merged_fullname$charI < 2] <- NA

merged_final <- unite(merged_fullname, "FirstName_final", a:i, sep = " ", na.rm = T)
merged_final <- unite(merged_final, "LastName_final", A:I, sep = " ", na.rm = T)

unique_names <- unique(merged_final[c("FirstName_final", "LastName_final")])
unique_names$ID <- seq.int(nrow(unique_names))

merged_final <- select(merged_final, -7:-27)
authors_with_id <- left_join(merged_final, unique_names, by = c("FirstName_final" = "FirstName_final", "LastName_final" = "LastName_final"))
write.csv(authors_with_id, "data_cleaning/intermeidate_data/authors_with_id.csv")

################################################################################
###            Cleaning and Creating Variables for the Analyses
################################################################################

library(data.table)
library(dplyr)
library(tidyr)
library(stringr)

### Article-author cleaning
# Read the data (!!! Change working directory if neccessary)
article <- fread("data_cleaning/intermeidate_data/article_meta.csv")
author <- fread("data_cleaning/intermeidate_data/authors_with_id.csv")
gender_match <- fread("data_cleaning/intermeidate_data/author_name_list_matched.csv")
gender_notmatch <- fread("data_cleaning/intermeidate_data/author_name_list_notmatched.csv")

article <- subset(article, select = c(pmid, journal_name, year))

# Address possible the Journal Name formatting issues
article$journal_id = unlist(lapply(article$journal_name, function(x) {paste(utf8ToInt(str_replace_all(x, c("[:blank:]" = "", "[:punct:]" = ""))), collapse = "")}))
article <- article[, .(first(year), first(journal_id)), by = pmid]
setnames(article, c("V1", "V2"), c("year", "journal_id"))

# Adding author gender info into the article level data
gender <- rbind(gender_notmatch, gender_match)
gender[gender == "", gender:= NA]

author <- subset(author, select = c(PMID, year, FirstName_final, LastName_final, ID))
author[, rank := rowid(PMID)]
author_w_gender <- merge(author, gender, by.x = "ID", by.y = "id", all.x = T)

# Create the article-leel indicators for female and male authors count
female <- author_w_gender[gender == "F", .N, by = PMID]
setnames(female, "N", "fcount")

male <- author_w_gender[gender == "M", .N, by = PMID]
setnames(male, "N", "mcount")

setorder(author_w_gender, PMID, rank)
author_w_gender[, nacount:= sum(is.na(gender)), by = PMID]
author_w_gender[, teamsize:= .N, by = PMID]

nacount_size <- author_w_gender[, .(mean(nacount), mean(teamsize)), by = PMID]
setnames(nacount_size, c("V1", "V2"), c("nacount", "teamsize"))

# Get first/last author gender
first <- author_w_gender[rank == 1, gender, by = PMID]
setnames(first, "gender", "first_author")

setorder(author_w_gender, PMID, rank)
last <- author_w_gender[, last(gender), by = PMID]
setnames(last, "V1", "last_author")

firstandlast <- merge.data.table(first, last, by.x = "PMID", by.y = "PMID", all = T)
exceptG <- merge.data.table(firstandlast, nacount_size, by.x = "PMID", by.y = "PMID", all = T)
f <- merge.data.table(exceptG, female, by = "PMID", all = T)
author_cleaned <- merge.data.table(f, male, by = "PMID", all = T)

author_cleaned <- author_cleaned[is.na(fcount), fcount:= 0]
author_cleaned <- author_cleaned[is.na(mcount), mcount:= 0]

author_article <- merge.data.table(author_cleaned, article, by.x = "PMID", by.y = "pmid", all.x = T)
author_article[first_author == "M", first_author:= 1]
author_article[first_author == "F", first_author:= 2]
author_article[last_author == "M", last_author:= 1]
author_article[last_author == "F", last_author:= 2]

### Mesh cleaning
# Read the data
descriptor <- fread("data_cleaning/intermeidate_data/descriptor_tree.csv")
mesh <- fread("data_cleaning/intermeidate_data/mesh_meta.csv")

# Identify articles with missing MeSH and add MeSH tree number info to the main dataset
mesh <- subset(mesh, select = c(PMID, DescriptorID, Descriptor_MajorTopic))
missing = subset(mesh[Descriptor_MajorTopic == "NULL"], select = -Descriptor_MajorTopic)
setnames(missing, "DescriptorID", "missing")
missing <- missing[, .(first(missing)), by = PMID]
setnames(missing, "V1", "missing")

mesh_descriptor <- merge.data.table(mesh, descriptor, by.x = "DescriptorID", by.y = "UI", all.x = T)
mesh_descriptor <- subset(mesh_descriptor, select = -V1)

for (i in 5:24) {
  mesh_descriptor[[i]] <- strtrim(mesh_descriptor[[i]], 1)
}
mesh_descriptor[Descriptor_MajorTopic == "NULL", Descriptor_MajorTopic:= NA]

# Get the article-level reserch field info based on MeSH tree number
get_field <- function(x) {
  a <- mesh_descriptor[TreeNumber1 == x |
                         TreeNumber2 == x |
                         TreeNumber3 == x |
                         TreeNumber4 == x |
                         TreeNumber5 == x |
                         TreeNumber6 == x |
                         TreeNumber7 == x |
                         TreeNumber8 == x |
                         TreeNumber9 == x |
                         TreeNumber10 == x |
                         TreeNumber11 == x |
                         TreeNumber12 == x |
                         TreeNumber13 == x |
                         TreeNumber14 == x |
                         TreeNumber15 == x |
                         TreeNumber16 == x |
                         TreeNumber17 == x |
                         TreeNumber18 == x |
                         TreeNumber19 == x |
                         TreeNumber20 == x]
  
  as.data.table(cbind(unique(a$PMID), as.numeric(rep(1, length(unique(a$PMID))))))
}

K <- get_field("K")
setnames(K, c("V1", "V2"), c("PMID", "binary_K"))

A <- get_field("A")
setnames(A, c("V1", "V2"), c("PMID", "binary_A"))

B <- get_field("B")
setnames(B, c("V1", "V2"), c("PMID", "binary_B"))

C <- get_field("C")
setnames(C, c("V1", "V2"), c("PMID", "binary_C"))

D <- get_field("D")
setnames(D, c("V1", "V2"), c("PMID", "binary_D"))

E <- get_field("E")
setnames(E, c("V1", "V2"), c("PMID", "binary_E"))

F <- get_field("F")
setnames(F, c("V1", "V2"), c("PMID", "binary_F"))

G <- get_field("G")
setnames(G, c("V1", "V2"), c("PMID", "binary_G"))

H <- get_field("H")
setnames(H, c("V1", "V2"), c("PMID", "binary_H"))

I <- get_field("I")
setnames(I, c("V1", "V2"), c("PMID", "binary_I"))

J <- get_field("J")
setnames(J, c("V1", "V2"), c("PMID", "binary_J"))

L <- get_field("L")
setnames(L, c("V1", "V2"), c("PMID", "binary_L"))

M <- get_field("M")
setnames(M, c("V1", "V2"), c("PMID", "binary_M"))

N <- get_field("N")
setnames(N, c("V1", "V2"), c("PMID", "binary_N"))

V <- get_field("V")
setnames(V, c("V1", "V2"), c("PMID", "binary_V"))

Z <- get_field("Z")
setnames(Z, c("V1", "V2"), c("PMID", "binary_Z"))

# Female Research binary
mesh_female <- mesh_descriptor[DescriptorName == "Female"]
mesh_female_pmid <- unique(mesh_female$PMID)
binary_female <- as.numeric(rep(1, length(mesh_female_pmid)))
article_female <- as.data.table(cbind(mesh_female_pmid, binary_female))
setnames(article_female, "mesh_female_pmid", "PMID")

# majortopic of female mesh term
mesh_female_majorY <- mesh_descriptor[DescriptorName == "Female" & Descriptor_MajorTopic == "Y"]
mesh_female_majorN <- mesh_descriptor[DescriptorName == "Female" & Descriptor_MajorTopic == "N"]
mesh_female_majorY_pmid <- unique(mesh_female_majorY$PMID)
mesh_female_majorN_pmid <- unique(mesh_female_majorN$PMID)
mesh_female_major_pmid <- append(mesh_female_majorY_pmid, mesh_female_majorN_pmid)

binary_female_majorY <- as.numeric(rep(1, length(mesh_female_majorY_pmid)))
binary_female_majorN <- as.numeric(rep(0, length(mesh_female_majorN_pmid)))
binary_female_major <- append(binary_female_majorY, binary_female_majorN)
article_female_major <- as.data.table(cbind(mesh_female_major_pmid, binary_female_major))
setnames(article_female_major, "mesh_female_major_pmid", "PMID")

article_female <- merge.data.table(article_female, article_female_major, by = "PMID", all = T)

# Male Research binary
mesh_male <- mesh_descriptor[DescriptorName == "Male"]
mesh_male_pmid <- unique(mesh_male$PMID)
binary_male <- as.numeric(rep(1, length(mesh_male_pmid)))
article_male <- as.data.table(cbind(mesh_male_pmid, binary_male))
setnames(article_male, "mesh_male_pmid", "PMID")

#majortopic of male mesh term
mesh_male_majorY <- mesh_descriptor[DescriptorName == "Male" & Descriptor_MajorTopic == "Y"]
mesh_male_majorN <- mesh_descriptor[DescriptorName == "Male" & Descriptor_MajorTopic == "N"]
mesh_male_majorY_pmid <- unique(mesh_male_majorY$PMID)
mesh_male_majorN_pmid <- unique(mesh_male_majorN$PMID)
mesh_male_major_pmid <- append(mesh_male_majorY_pmid, mesh_male_majorN_pmid)

binary_male_majorY <- as.numeric(rep(1, length(mesh_male_majorY_pmid)))
binary_male_majorN <- as.numeric(rep(0, length(mesh_male_majorN_pmid)))
binary_male_major <- append(binary_male_majorY, binary_male_majorN)
article_male_major <- as.data.table(cbind(mesh_male_major_pmid, binary_male_major))
setnames(article_male_major, "mesh_male_major_pmid", "PMID")

article_male <- merge.data.table(article_male, article_male_major, by = "PMID", all = T)

# Add the above variables into the main dataset
merge1 <- merge.data.table(author_article, article_female, by = "PMID", all.x = T)
merge1 <- merge.data.table(merge1, article_male, by = "PMID", all.x = T)
merge1 <- merge.data.table(merge1, A, by = "PMID", all.x = T)
merge1 <- merge.data.table(merge1, B, by = "PMID", all.x = T)
merge1 <- merge.data.table(merge1, C, by = "PMID", all.x = T)
merge1 <- merge.data.table(merge1, D, by = "PMID", all.x = T)
merge1 <- merge.data.table(merge1, E, by = "PMID", all.x = T)
merge1 <- merge.data.table(merge1, F, by = "PMID", all.x = T)
merge1 <- merge.data.table(merge1, G, by = "PMID", all.x = T)
merge1 <- merge.data.table(merge1, H, by = "PMID", all.x = T)
merge1 <- merge.data.table(merge1, I, by = "PMID", all.x = T)
merge1 <- merge.data.table(merge1, J, by = "PMID", all.x = T)
merge1 <- merge.data.table(merge1, K, by = "PMID", all.x = T)
merge1 <- merge.data.table(merge1, L, by = "PMID", all.x = T)
merge1 <- merge.data.table(merge1, M, by = "PMID", all.x = T)
merge1 <- merge.data.table(merge1, N, by = "PMID", all.x = T)
merge1 <- merge.data.table(merge1, Z, by = "PMID", all.x = T)
merge1 <- merge.data.table(merge1, missing, by = "PMID", all.x = T)

merge1[is.na(missing) & is.na(binary_female), binary_female:= 0]
merge1[is.na(missing) & is.na(binary_male), binary_male:= 0]
merge1[is.na(missing) & is.na(binary_A), binary_A:= 0]
merge1[is.na(missing) & is.na(binary_B), binary_B:= 0]
merge1[is.na(missing) & is.na(binary_C), binary_C:= 0]
merge1[is.na(missing) & is.na(binary_D), binary_D:= 0]
merge1[is.na(missing) & is.na(binary_E), binary_E:= 0]
merge1[is.na(missing) & is.na(binary_F), binary_F:= 0]
merge1[is.na(missing) & is.na(binary_G), binary_G:= 0]
merge1[is.na(missing) & is.na(binary_H), binary_H:= 0]
merge1[is.na(missing) & is.na(binary_I), binary_I:= 0]
merge1[is.na(missing) & is.na(binary_J), binary_J:= 0]
merge1[is.na(missing) & is.na(binary_K), binary_K:= 0]
merge1[is.na(missing) & is.na(binary_L), binary_L:= 0]
merge1[is.na(missing) & is.na(binary_M), binary_M:= 0]
merge1[is.na(missing) & is.na(binary_N), binary_N:= 0]
merge1[is.na(missing) & is.na(binary_Z), binary_Z:= 0]

# Construct minority, majority, all female/male indicators
merge1 = merge1[, min_f:= as.numeric((fcount > 0) & (fcount < mcount))]
merge1 = merge1[, maj_f:= as.numeric((mcount > 0) & (fcount >= mcount))]
merge1 = merge1[, all_f:= as.numeric((fcount > 0) & (mcount == 0))]
merge1 = merge1[, all_m:= as.numeric((fcount == 0) & (mcount > 0))]

# Mesh FEs creation

descriptor <- fread("data_cleaning/intermeidate_data/descriptor_tree.csv")
mesh <- fread("data_cleaning/intermeidate_data/mesh_meta.csv")
mesh <- subset(mesh, select = c(PMID, DescriptorID, Descriptor_MajorTopic))

mesh_descriptor <- merge.data.table(mesh, descriptor, by.x = "DescriptorID", by.y = "UI", all.x = T)
mesh_descriptor <- subset(mesh_descriptor, select = -V1)

pmid_list <- unique(merge1$PMID)
mesh_descriptor[DescriptorID == "NULL", DescriptorID:= NA]
meshNoNA = subset(mesh_descriptor, select = -c(DescriptorName, Descriptor_MajorTopic))
meshNoNA = na.omit(meshNoNA, cols = "DescriptorID")
meshNoNA = meshNoNA[meshNoNA$PMID %in% pmid_list]
meshNoNA = unique(meshNoNA, by = c("PMID", "DescriptorID"))

mesh_long = melt(meshNoNA, id.vars = c("PMID", "DescriptorID"), value.name = "TreeNumber", na.rm = T)

mesh_long = subset(mesh_long, select = -variable)
mesh_long <- as.data.frame(mesh_long)

# Randomly assign the mesh rank by pmid (for feeding Rem's patent cleaning code (mesh fe's))

mesh_long = mesh_long %>% group_by(PMID) %>% mutate(rand_int=sample.int(n()))

pubmed_disease_fe <- 
  mesh_long %>%  
  filter(str_sub(TreeNumber,1,1) == "C", str_count(TreeNumber, "\\.") >  2) %>% 
  group_by(PMID, DescriptorID) %>% 
  slice_head(n=1) %>% 
  ungroup() %>% 
  mutate(
    mesh_tree = as.numeric(as.factor(DescriptorID)),
    mesh_rank = rand_int
  ) %>% 
  dplyr::select(PMID, mesh_tree, mesh_rank) %>% 
  arrange(PMID, mesh_rank) %>% 
  group_by(PMID) %>% 
  slice_head(n=10) %>% 
  mutate(m = row_number(mesh_rank)) %>% 
  ungroup() %>% 
  arrange(PMID, mesh_rank) %>% 
  dplyr::select(PMID, mesh_tree, m) %>% 
  spread(m, mesh_tree, sep="_", fill=0)

## Check the number of retained observations
length(unique(pubmed_disease_fe$PMID))
##[1] 4734706

## Check the total C level observations in the main data
sum(na.omit(merge1$binary_C))
## [1] 5616175, thus the above calculation is reasonable

pubmed_disease_fe <- as.data.table(pubmed_disease_fe)
pubmed2002_reg_meshC <- merge.data.table(merge1, pubmed_disease_fe, by = "PMID", all.x = T)

###  Dataset for publication related analyses with all publication type

fwrite(pubmed2002_reg_meshC, "data_cleaning/intermeidate_data/pubmed2002_reg_meshC.csv")

################################################################################
###                         For the main analyses
################################################################################

### read in all pubmed and patent text data
## read in 1976-2020 article meta data from the xmls
meta <- fread("data_cleaning/intermeidate_data/all_article_meta.csv") %>% as_tibble()

## first get rid of all pmids with duplicates. We are not sure about why
## duplicates are generated, but those are only 1,356 out of 30,419,056 pmids.
meta <- meta %>% filter(!(duplicated(pmid) | duplicated(pmid, fromLast = TRUE)))

##############   jcif merge    ###############
### read in the jcif, pubmed-mag xwalk, magid-journal xwalk data
# magid-journal xwalk
paperjournalid <- as_tibble(fread("data_cleaning/raw_data/paperjournalid.tsv", sep = "\t"))

# jcif
jcif <- as_tibble(fread("data_cleaning/raw_data/jcif.tsv", sep = "\t"))

# magid-pmid xwalk
cross <- fread("data_cleaning/raw_data/PaperExtendedAttributes.txt", sep = "\t")
cross <- as_tibble(cross)
colnames(cross) <- c("magid", "type", "pmid")
cross <- filter(cross, type == 2) %>% select(-type)

# remove leading zeros in pmid column
cross$pmid <- as.integer(str_remove(cross$pmid, "^0+"))

## build jcif for pubmed articles
# merge the 3 datasets above together and remove unnecessary columns
pubmed_jcif <- cross %>%
  inner_join(paperjournalid, by = c("magid" = "paperid")) %>%
  inner_join(select(filter(meta, year >= 2002), pmid, year, type), by = "pmid") %>%
  inner_join(jcif, by = c("journalid", "year")) %>%
  group_by(pmid) %>%
  filter(jcif == max(jcif)) %>% slice_head(n = 1) %>% ungroup()

###   filtering pubmed article types
paper_type <- meta %>% select(pmid, type, year) %>%
  filter(str_detect(type, "Journal Article"), year >= 2002)

## keep the needed publication types
paper_type <- paper_type %>% select(1:3) %>%
  mutate(if_only_art_jrnl = (type == "Journal Article"))
  filter(
    str_detect(type, 
               "Research Support, N.I.H., Extramural|Comparative Study|Case Reports|Randomized Controlled Trial|Research Support, U.S. Gov't, P.H.S.|Clinical Trial|Research Support, Non-U.S. Gov't Research Support, U.S. Gov't, Non-P.H.S.|Evaluation Study|Multicenter Study|Validation Study|Observational Study|Research Support, N.I.H., Intramural|Clinical Trial, Phase I|Clinical Trial, Phase II|Clinical Trial, Phase III|Controlled Clinical Trial"
    ) | if_only_art_jrnl
  )

### Full research article sample data
dat <- pubmed2002_reg_meshC %>% as_tibble() %>%
  select(-year) %>% 
  inner_join(paper_type, by = c("PMID" = "pmid")) %>% as.data.table()

dat <- dat[!is.na(binary_female)]
fwrite(dat, "/analysis/clean_data/article_data/pubmed2002_reg_meshC_types.csv")
  
### construct the top 1k JCIF data, which is the one used for our main analyses
dat$f100 <- dat$binary_female * 100
dat$m100 <- dat$binary_male * 100
dat$year <- as.numeric(as.factor(dat$year))
dat$teamsize <- as.numeric(as.factor(dat$teamsize))
dat$journal_id <- as.numeric(as.factor(dat$journal_id))
dat$all_f <- as.logical(dat$all_f)
dat$all_m <- as.logical(dat$all_m)
dat$min_f <- as.logical(dat$min_f)
dat$maj_f <- as.logical(dat$maj_f)

dat <- as_tibble(dat)

# read in the cleaned jcif data
pubmed_jcif <- pubmed_jcif %>%
  select(-magid, -type, -journalname, -year)

# merge to the main data
dat_jcif <- dat %>%
  inner_join(pubmed_jcif, by = c("PMID" = "pmid")) %>%
  mutate(journalid = as.numeric(as.factor(journalid)))

###   JCIF data check
jcif_check <- dat_jcif %>% 
  filter(year < 18) %>% 
  group_by(journalid) %>% summarise(cnt = n_distinct(year)) %>%
  filter(cnt == 17)

## only keep journals with jcif info every year from 2002 to 2018
dat_jcif <- dat_jcif %>%
  filter(journalid %in% jcif_check$journalid)
###

## function to make sample cut, then get the corresponding general regression result
jcif_cut_reg <- function(top_num) {
  # slice the top n journals by average jcif across years
  cut <- dat_jcif %>%
    group_by(year, journalid) %>%
    summarise(jcif = mean(jcif)) %>%
    group_by(journalid) %>%
    summarise(jcif = mean(jcif)) %>%
    slice_max(order_by = jcif, n = top_num) %>% ungroup() %>%
    select(-jcif)
  
  # use this data to filter out the less applied journal in the main data
  reg_dat <- dat_jcif %>%
    filter(journalid %in% cut$journalid)
  
  return(reg_dat)
}

## The dataset with articles from the top 1k journals by JCIF
dat_1k <- jcif_cut_reg(1000)

fwrite(dat_1k, "/analysis/clean_data/article_data/jcif_1k_regdat.csv")

################################################################################
###                       For JCIF robustness checks
################################################################################

### sample with research articles in the top 100 and 500 JCIF journals
dat_100 <- jcif_cut_reg(100)
dat_500 <- jcif_cut_reg(500)

fwrite(dat_100, "/analysis/clean_data/article_data/jcif_100_regdat.csv")
fwrite(dat_500, "/analysis/clean_data/article_data/jcif_500_regdat.csv")

################################################################################
###               For article gender focus prediction analyses
################################################################################

### Load the cleaned pubmed abstract and title text 
# (since this data is too large, we are not including it in our replication package)
txt_cleaned_all <- fread("txt_cleaned_all.csv") %>% as_tibble() %>%
  filter(PMID %in% dat$PMID)

# write it out for fasttext prediction
# (since this data is too large, we are not including it in our replication package)
fwrite(list(txt_cleaned_all$text), "abs_all_pred.txt", quote = F)

### Code we run in fasttext in the shell to get abstract gender focus prediction
## We use the 100-dimensionl model we trained fasttext_model.bin to build the
## prediction model (prediction model construction can be found in the patent code)

# For the female focus prediction, we use the supervised model supervised_model_female.bin
# ./fasttext predict-prob supervised_model_female.bin abs_all_pred.txt > abs_all_pred_female.csv
# 
# For the male focus prediction, we use the supervised model supervised_model_male.bin
# ./fasttext predict-prob supervised_model_male.bin abs_all_pred.txt > abs_all_pred_male.csv


### Load the article gender focus prediction results from fasttext
pred_female <- read_delim("data_cleaning/intermediate_data/abs_all_pred_female.csv", " ", col_names = c("label", "estimate")) %>%
  mutate(estimate = ifelse(label == "__label__All", 1 - estimate, estimate)) %>%
  select(estimate) %>%
  rename(female_trial_est = estimate) %>%
  mutate(PMID = txt_cleaned_all$PMID)

pred_male <- read_delim("data_cleaning/intermediate_data/abs_all_pred_male.csv", " ", col_names = c("label", "estimate")) %>%
  mutate(estimate = ifelse(label == "__label__All", 1 - estimate, estimate)) %>%
  select(estimate) %>%
  rename(male_trial_est = estimate) %>%
  mutate(PMID = txt_cleaned_all$PMID)

## load paper abstract info for examples output
paper_title <- fread("data_cleaning/intermediate_data/all_article_meta.csv") %>% as_tibble() %>%
  filter(pmid %in% txt_cleaned_all$PMID) %>%
  rename(PMID = pmid) %>%
  select(abstract, title, PMID)

## regression data
dat_pred <- 
  dat %>%
  left_join(pred_female, by = "PMID") %>%
  left_join(pred_male, by = "PMID") %>%
  left_join(txt_cleaned_all, by = "PMID") %>%
  group_by(PMID) %>%
  filter(row_number() == 1) %>% ungroup()

# add article title to it
dat_pred <- 
  dat_pred %>%
  left_join(select(paper_title, PMID, title, abstract), by = "PMID") %>%
  filter(PMID %in% dat_1k$PMID)

### write the data out for prediction regressions and analyses
fwrite(dat_pred, "/analysis/clean_data/article_data/dat_pred.csv")

################################################################################
###                   For disease gender incidence analyses
################################################################################

library(readstata13)

## load in all relevant data
# dta reading function
read_stata <- function(x) read.dta13(x) %>% as_tibble()

# disease incidence
gbd_health <- read_stata("data_cleaning/raw_data/gbd_incidence_2010.dta") %>%
  mutate(gbd_cause = gbd_cause %>%
           str_trim() %>%
           str_replace_all("\\s+", " ") %>%
           str_to_lower()) %>%
  filter(!is.na(incidence_female))

# mesh-disease xwalk
gbd_mesh <- read_stata("data_cleaning/raw_data/gbd_to_mesh_simple.dta") %>%
  mutate(gbd_cause = gbd_cause %>%
           str_trim() %>%
           str_replace_all("\\s+", " ") %>%
           str_to_lower(), 
         mesh_term = mesh_term %>%
           str_trim() %>%
           str_replace_all("\\s+", " ") %>%
           str_to_lower())

# get article year info
dat_year <- fread("/analysis/clean_data/article_data/pubmed2002_reg_meshC_types.csv") %>% as_tibble() %>%
  select(PMID, year)

# pubmed mesh data based on 1000 jcif
pubmed_mesh <- fread("data_cleaning/intermediate_data/mesh_meta.csv") %>% as_tibble() %>%
  select(PMID, DescriptorName) %>%
  filter(PMID %in% dat_1k$PMID) %>%
  mutate(DescriptorName = DescriptorName %>%
           str_trim() %>%
           str_replace_all("\\s+", " ") %>%
           str_to_lower()) %>%
  left_join(dat_year, by = "PMID")

## build the incidence level regression data

# merging
pubmed_incidence <- gbd_health %>% 
  select(gbd_cause, incidence_female, incidence_male) %>%
  inner_join(select(gbd_mesh, gbd_cause, mesh_term), by = "gbd_cause") %>%
  inner_join(
    pubmed_mesh, by = c("mesh_term" = "DescriptorName")
  ) %>%
  group_by(PMID, gbd_cause) %>%
  filter(row_number() == 1) %>% ungroup()

# regression data based on top 1k jcif sample
incidence_reg1k <- pubmed_incidence %>%
  group_by(PMID) %>%
  mutate(num_mesh = n()) %>%
  ungroup() %>%
  mutate(weights = 1/num_mesh) %>%
  select(-year) %>%
  left_join(dat, by = "PMID") %>%
  
  # build additional regression variables
  mutate(log_if = log(incidence_female+1),
         log_im = log(incidence_male+1)) %>%
  group_by(PMID, gbd_cause) %>%
  filter(row_number() == 1) %>% ungroup()

fwrite(incidence_reg1k, "/analysis/clean_data/article_data/incidence_reg1k.csv")